#!/bin/bash
# Launch an xinference server and deploy DeepSeek-R1-Distill-Llama-70B on it.
# Assumes: model weights at /data/DeepSeek-R1-Distill-Llama-70B, cache dir
# /xinference/xinference_cache, and 8 GPUs available on this host.

# Generous vLLM timeouts so slow startup / long generations are not killed.
export VLLM_ENGINE_ITERATION_TIMEOUT_S=36000
export VLLM_RPC_TIMEOUT=36000000
export VLLM_ENFORCE_CUDA_GRAPH=1

# Start the xinference server in the background and capture its PID right
# away: $! refers to the most recent background job, so it must be read here,
# not after later foreground commands (the original captured it too late).
XINFERENCE_MODEL_SRC=modelscope XINFERENCE_HOME=/xinference/xinference_cache \
  xinference-local -H 0.0.0.0 &
server_pid=$!

# Block until the server answers on its default port (9997).
until curl -s "http://localhost:9997" > /dev/null; do
  sleep 1
done

# Deploy the model (foreground; returns once the launch request completes).
# $((1024*32)) replaces the deprecated $[...] arithmetic form.
# NOTE(review): flag spellings (underscore vs dash) are kept exactly as the
# original passed them to the xinference CLI.
xinference launch \
  --model-engine vllm \
  --model-name deepseek-r1-distill-llama \
  --size-in-billions 70 \
  --model-format pytorch \
  --model_path /data/DeepSeek-R1-Distill-Llama-70B \
  --n-gpu 8 \
  --replica 2 \
  --max_model_len $((1024*32)) \
  --enable_prefix_caching True \
  --enable_chunked_prefill True \
  --trust-remote-code 1
# Optional reasoning flags (left disabled, as in the original):
#   --enable_reasoning --reasoning_parser deepseek_r1

# Keep the script (e.g. as container PID 1) alive for as long as the server
# runs, and propagate the server's exit status.
wait "$server_pid"